# -*- coding: utf-8 -*-
"""
Created on Sat Nov  4 21:56:51 2017

@author: PWOLFF
"""

import numpy as np
import glob

def make_vector_dic(vectors_filename):
    """Load vectors from a tab-separated file and scale them to unit length.

    Every non-blank row holds a key in its first column followed by the
    numeric components of the vector, all separated by tabs.  Trailing tabs
    on a row are ignored.

    Parameters
    ----------
    vectors_filename : str
        Path of the tab-separated vector file.

    Returns
    -------
    dict
        Maps the first field of each row to a 1-D ``numpy`` float array
        divided by its Euclidean norm.  A vector whose norm is zero (all
        zeros, or a row with no components) is stored unscaled instead of
        being turned into NaNs.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    vector_dic = {}
    with open(vectors_filename, 'r') as infile:
        for row in infile:
            # Drop the line terminator first, then any trailing empty columns.
            row = row.rstrip('\n').rstrip('\t')
            if not row:
                # A blank row would otherwise register an empty vector
                # under the key ''.
                continue
            fields = row.split('\t')
            vector = np.array([float(value) for value in fields[1:]])
            norm = np.linalg.norm(vector)
            if norm > 0:
                # Guard: dividing a zero vector by its zero norm gives NaNs.
                vector = vector / norm
            vector_dic[fields[0]] = vector
    return vector_dic

def split_entry(entrystr):
    """Split a ``"<verb> <weight>"`` entry at its first space.

    Parameters
    ----------
    entrystr : str
        Text of the form ``"<verb> <weight>"``.  Whitespace around the
        weight (for example a trailing newline) is tolerated by ``float``.

    Returns
    -------
    list
        ``[verb, weight]`` where ``verb`` is the text before the first
        space and ``weight`` is the remainder converted to ``float``.

    Raises
    ------
    ValueError
        If the entry contains no space, or the text after the first space
        is not a valid float.
    """
    verb, separator, weight_text = entrystr.partition(' ')
    if not separator:
        # str.find() returned -1 here before, which silently chopped the
        # last character off the entry (e.g. '3.5' -> ['3.', 3.5]).
        raise ValueError(
            "component entry has no space separator: %r" % (entrystr,))
    return [verb, float(weight_text)]

def readin_components(component_file):
    """Read a tab-separated component file.

    Each non-blank row is split on tabs: column 0 is the key, column 1 is
    parsed as a float (stored in the second returned dict), and every
    non-empty column from index 2 onwards is parsed with ``split_entry``
    into a ``[verb, weight]`` pair.

    Parameters
    ----------
    component_file : str
        Path of the tab-separated component file.

    Returns
    -------
    tuple of (dict, dict)
        ``vec_dic`` maps each key to its list of ``[verb, weight]`` pairs
        in file order; pairs from repeated keys are appended to the same
        list.  ``fit_dic`` maps each key to the float from column 1; for a
        repeated key the last row wins.

    Raises
    ------
    IndexError
        If a non-blank row has no second column.
    ValueError
        If column 1 or a component weight is not a valid float.
    """
    vec_dic = {}
    fit_dic = {}
    with open(component_file, "r") as infile:
        for row in infile:
            row = row.rstrip('\n')
            if not row:
                # A blank line used to register a '\n' key and then fail
                # with IndexError on the missing second column.
                continue
            line = row.split('\t')
            components = vec_dic.setdefault(line[0], [])
            for entry in line[2:]:
                if entry != '':  # empty columns carry no component
                    components.append(split_entry(entry))
            fit_dic[line[0]] = float(line[1])
    return vec_dic, fit_dic

def partition_components(vec_comp_dic):
    """Choose, for every key, where to cut its weight list in two.

    For each key the weights (element 1 of every component entry) are
    taken in their stored order.  Every cut position from 2 up to
    ``len(weights) - 2`` is scored with the ratio of the variance of the
    two group means to the pooled within-group variance, and the position
    with the largest ratio is kept.  Keys with fewer than four weights
    have no candidate position and get a cut of 0.

    Parameters
    ----------
    vec_comp_dic : dict
        Maps each key to a list of entries whose element 1 is a numeric
        weight.

    Returns
    -------
    tuple
        ``(mean_cut, mean_remainder, partition_dic)``: the mean cut
        position over all keys, the mean number of weights after the cut,
        and a dict mapping each key to its cut position.
    """
    partition_dic = {}
    remainder_counts = {}
    for key in list(vec_comp_dic.keys()):
        weights = [entry[1] for entry in vec_comp_dic[key]]
        best_cut = 0
        best_ratio = 0
        for cut in range(2, len(weights) - 1):
            head = np.array(weights[0:cut])
            tail = np.array(weights[cut:])
            # Variance between the two group means.
            between_var = np.array([head.mean(), tail.mean()]).var()

            head_var = head.var()
            tail_var = tail.var()
            head_df = len(head) - 1
            tail_df = len(tail) - 1
            # Within-group variance, weighted by degrees of freedom.
            within_var = (head_df*head_var + tail_df*tail_var) / (head_df+tail_df)
            ratio = between_var/within_var
            if ratio > best_ratio:
                best_ratio = ratio
                best_cut = cut
        partition_dic[key] = best_cut
        remainder_counts[key] = len(weights) - best_cut

    mean_num_core_components = np.array(list(partition_dic.values())).mean()
    mean_num_ideosyncratic = np.array(list(remainder_counts.values())).mean()

    return mean_num_core_components, mean_num_ideosyncratic, partition_dic

def vectorize(vec_comp_dic, partition_dic, vector_dic, dim=None):
    """Build a unit-length weighted sum of each key's leading components.

    For every key, the first ``partition_dic[key]`` component entries are
    looked up in ``vector_dic``; each vector found is multiplied by the
    entry's weight and added to the running sum.  Components missing from
    ``vector_dic`` are skipped.

    Parameters
    ----------
    vec_comp_dic : dict
        Maps each key to a list of ``[name, weight]`` entries.
    partition_dic : dict
        Maps each key to the number of leading entries to use.
    vector_dic : dict
        Maps component names to numeric vectors.
    dim : int, optional
        Length of the output vectors.  When omitted it is taken from the
        longest vector in ``vector_dic``, falling back to 200 (the value
        that used to be hard-coded) if ``vector_dic`` is empty.

    Returns
    -------
    dict
        Maps each key to a 1-D ``numpy`` float array of length ``dim``
        scaled to unit Euclidean norm.  If nothing was summed (or the sum
        is the zero vector) the zero vector is returned rather than NaNs.
    """
    if dim is None:
        # The longest vector is used so that stray empty entries (e.g. from
        # a header or blank row in the vector file) do not set the size.
        dim = max((len(vec) for vec in vector_dic.values()), default=200)

    sent_core_vectors = {}
    for key, components in vec_comp_dic.items():
        core_vec = np.zeros(dim)
        for i in range(partition_dic[key]):
            name = components[i][0]
            weight = components[i][1]
            if name in vector_dic:
                core_vec = core_vec + np.array(vector_dic[name]) * weight
        norm = np.linalg.norm(core_vec)
        if norm > 0:
            # Guard: normalising a zero vector would divide by zero.
            core_vec = core_vec / norm
        sent_core_vectors[key] = core_vec
    return sent_core_vectors

def build_core_component_dic(vec_comp_dic,partition_dic):
    """Keep only the leading entries of each key's component list.

    Parameters
    ----------
    vec_comp_dic : dict
        Maps each key to a list of component entries.
    partition_dic : dict
        Maps each key to the number of leading entries to keep.

    Returns
    -------
    dict
        Maps each key to a new list holding its first
        ``partition_dic[key]`` entries (the entry objects themselves are
        shared, not copied).  Keys for which no entry is kept are left out
        of the result.
    """
    core_vec_comp_dic = {}
    for key, components in vec_comp_dic.items():
        kept = [components[i] for i in range(partition_dic[key])]
        if kept:
            core_vec_comp_dic[key] = kept
    return core_vec_comp_dic

def save_ttest_dic(mean_dic,ideo_dic,outfileName):
    """Write one ``key,mean,ideo`` CSV line per key, in sorted key order.

    Parameters
    ----------
    mean_dic : dict
        Values written in the second column; its keys drive the output.
    ideo_dic : dict
        Values written in the third column; must contain every key of
        ``mean_dic``.
    outfileName : str
        Path of the file to create (an existing file is overwritten).

    Raises
    ------
    KeyError
        If a key of ``mean_dic`` is missing from ``ideo_dic``.
    """
    # The context manager closes the file; the handle used to be left open,
    # so buffered output was only flushed when it was garbage-collected.
    with open(outfileName, 'w') as outfile:
        for key in sorted(mean_dic):
            fields = (str(key), str(mean_dic[key]), str(ideo_dic[key]))
            outfile.write(','.join(fields) + '\n')
   
def save_vectors(sent_core_vectors,outvecfilename):
    """Write each vector as one tab-separated line, in sorted key order.

    Every line has the form ``key<TAB>v0<TAB>v1...`` where each value is
    rendered with ``str``.

    Parameters
    ----------
    sent_core_vectors : dict
        Maps each key to an iterable of numbers.
    outvecfilename : str
        Path of the file to create (an existing file is overwritten).
    """
    # The context manager closes the file; the handle used to be left open.
    with open(outvecfilename, 'w') as outfile:
        for key in sorted(sent_core_vectors):
            # join() builds the row in one pass instead of repeated '+='.
            numstr = '\t'.join(str(value) for value in sent_core_vectors[key])
            outfile.write(str(key) + '\t' + numstr + '\n')
        
def save_core_componets(core_vec_comp_dic,fit_dic,outfileName):
    """Write each key's components to a '.tac' file, in sorted key order.

    The output path is ``outfileName`` with every ``'.txt'`` replaced by
    ``'.tac'``.  Every line has the form
    ``key<TAB>fit<TAB>name weight<TAB>name weight...``.

    NOTE(review): if ``outfileName`` contains no ``'.txt'`` the path is
    unchanged, so the file named by ``outfileName`` itself is overwritten.
    Confirm callers always pass a ``.txt`` name.

    Parameters
    ----------
    core_vec_comp_dic : dict
        Maps each key to a list of ``[name, weight]`` entries.
    fit_dic : dict
        Maps each key to the value written in the second column; must
        contain every key of ``core_vec_comp_dic``.
    outfileName : str
        Name from which the output path is derived.

    Raises
    ------
    KeyError
        If a key of ``core_vec_comp_dic`` is missing from ``fit_dic``.
    """
    # The context manager closes the file; the handle used to be left open.
    with open(outfileName.replace('.txt','.tac'), 'w') as outfile:
        for key in sorted(core_vec_comp_dic):
            # join() builds the row in one pass instead of repeated '+='.
            rowstr = '\t'.join(
                str(entry[0]) + ' ' + str(entry[1])
                for entry in core_vec_comp_dic[key])
            outfile.write(str(key) + '\t' + str(fit_dic[key]) + '\t' + rowstr + '\n')

################  Begin here #################

# The program takes a file of semantic components with weights. It outputs a 'TAC' file that lists the primary (core) components.
# The program also outputs a 'VEC' file that holds a re-vectorization of those core components using their weights.

#input_files = glob.glob('*.tab')
# Run configuration: the component files to process, the vector file used
# for re-vectorization, and the summary file name (only used by the
# disabled save_ttest_dic call at the bottom).
input_files = ['nyt_d200_w5_13592_comp.txt']
vector_input_filename = 'nyt_d200_w5_13592_1.txt'
outfileName = 'nyt_d200_w5_13592_comp_core.csv'

# Per-run summaries keyed by IDnum.
mean_dic, ideo_dic = {}, {}

vector_dic = make_vector_dic(vector_input_filename)

for i, component_file in enumerate(input_files):
    # Separator positions, needed only by the disabled ID parsing below.
    space_num = component_file.find(' ')
    score_num = component_file.find('_')
    #IDnum = int(input_files[i][space_num:score_num])
    IDnum = 1  # every input file is currently recorded under the same ID

    # vec_comp_dic has components + weights for each verb
    vec_comp_dic, fit_dic = readin_components(component_file)
    (mean_num_core_components,
     mean_num_ideosyncratic,
     partition_dic) = partition_components(vec_comp_dic)
    mean_dic[IDnum] = mean_num_core_components
    ideo_dic[IDnum] = mean_num_ideosyncratic

    sent_core_vectors = vectorize(vec_comp_dic, partition_dic, vector_dic)
    core_vec_comp_dic = build_core_component_dic(vec_comp_dic, partition_dic)

    # Outputs are written next to the input, swapping '.txt' for
    # '.vec' here and for '.tac' inside save_core_componets.
    outvecfilename = component_file.replace('.txt','.vec')
    save_vectors(sent_core_vectors, outvecfilename)
    save_core_componets(core_vec_comp_dic, fit_dic, component_file)

#save_ttest_dic(mean_dic,ideo_dic,outfileName)